import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import os
import plotly.express as px
os.listdir()
['MoviesOnStreamingPlatforms_updated.csv', '.jovianrc', '.ipynb_checkpoints', 'Screen Shot 2020-10-20 at 7.05.03 PM.png', 'streaming-movies.ipynb']
raw_movies_df = pd.read_csv('MoviesOnStreamingPlatforms_updated.csv', index_col=1)
raw_movies_df.drop(columns='Unnamed: 0', inplace=True)
raw_movies_df.head()
| Title | Year | Age | IMDb | Rotten Tomatoes | Netflix | Hulu | Prime Video | Disney+ | Type | Directors | Genres | Country | Language | Runtime | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ID | |||||||||||||||
| 1 | Inception | 2010 | 13+ | 8.8 | 87% | 1 | 0 | 0 | 0 | 0 | Christopher Nolan | Action,Adventure,Sci-Fi,Thriller | United States,United Kingdom | English,Japanese,French | 148.0 |
| 2 | The Matrix | 1999 | 18+ | 8.7 | 87% | 1 | 0 | 0 | 0 | 0 | Lana Wachowski,Lilly Wachowski | Action,Sci-Fi | United States | English | 136.0 |
| 3 | Avengers: Infinity War | 2018 | 13+ | 8.5 | 84% | 1 | 0 | 0 | 0 | 0 | Anthony Russo,Joe Russo | Action,Adventure,Sci-Fi | United States | English | 149.0 |
| 4 | Back to the Future | 1985 | 7+ | 8.5 | 96% | 1 | 0 | 0 | 0 | 0 | Robert Zemeckis | Adventure,Comedy,Sci-Fi | United States | English | 116.0 |
| 5 | The Good, the Bad and the Ugly | 1966 | 18+ | 8.8 | 97% | 1 | 0 | 1 | 0 | 0 | Sergio Leone | Western | Italy,Spain,West Germany | Italian | 161.0 |
Make the column names uniform
raw_movies_df.columns
Index(['Title', 'Year', 'Age', 'IMDb', 'Rotten Tomatoes', 'Netflix', 'Hulu',
'Prime Video', 'Disney+', 'Type', 'Directors', 'Genres', 'Country',
'Language', 'Runtime'],
dtype='object')
EX) Rename columns: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.rename.html df.rename(columns={"A": "a", "B": "c"})
raw_movies_df.columns = map(str.lower, raw_movies_df.columns)
Making all column names lower case for ease of use. Can also use: df.rename(str.lower, axis='columns')
raw_movies_df = raw_movies_df.rename(columns={'rotten tomatoes':'rotten_tomatoes','prime video': 'prime_video'})
raw_movies_df
| title | year | age | imdb | rotten_tomatoes | netflix | hulu | prime_video | disney+ | type | directors | genres | country | language | runtime | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ID | |||||||||||||||
| 1 | Inception | 2010 | 13+ | 8.8 | 87% | 1 | 0 | 0 | 0 | 0 | Christopher Nolan | Action,Adventure,Sci-Fi,Thriller | United States,United Kingdom | English,Japanese,French | 148.0 |
| 2 | The Matrix | 1999 | 18+ | 8.7 | 87% | 1 | 0 | 0 | 0 | 0 | Lana Wachowski,Lilly Wachowski | Action,Sci-Fi | United States | English | 136.0 |
| 3 | Avengers: Infinity War | 2018 | 13+ | 8.5 | 84% | 1 | 0 | 0 | 0 | 0 | Anthony Russo,Joe Russo | Action,Adventure,Sci-Fi | United States | English | 149.0 |
| 4 | Back to the Future | 1985 | 7+ | 8.5 | 96% | 1 | 0 | 0 | 0 | 0 | Robert Zemeckis | Adventure,Comedy,Sci-Fi | United States | English | 116.0 |
| 5 | The Good, the Bad and the Ugly | 1966 | 18+ | 8.8 | 97% | 1 | 0 | 1 | 0 | 0 | Sergio Leone | Western | Italy,Spain,West Germany | Italian | 161.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 16740 | The Ghosts of Buxley Hall | 1980 | NaN | 6.2 | NaN | 0 | 0 | 0 | 1 | 0 | Bruce Bilson | Comedy,Family,Fantasy,Horror | United States | English | 120.0 |
| 16741 | The Poof Point | 2001 | 7+ | 4.7 | NaN | 0 | 0 | 0 | 1 | 0 | Neal Israel | Comedy,Family,Sci-Fi | United States | English | 90.0 |
| 16742 | Sharks of Lost Island | 2013 | NaN | 5.7 | NaN | 0 | 0 | 0 | 1 | 0 | Neil Gelinas | Documentary | United States | English | NaN |
| 16743 | Man Among Cheetahs | 2017 | NaN | 6.6 | NaN | 0 | 0 | 0 | 1 | 0 | Richard Slater-Jones | Documentary | United States | English | NaN |
| 16744 | In Beaver Valley | 1950 | NaN | NaN | NaN | 0 | 0 | 0 | 1 | 0 | James Algar | Documentary,Short,Family | United States | English | 32.0 |
16744 rows × 15 columns
raw_movies_df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 16744 entries, 1 to 16744 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 title 16744 non-null object 1 year 16744 non-null int64 2 age 7354 non-null object 3 imdb 16173 non-null float64 4 rotten_tomatoes 5158 non-null object 5 netflix 16744 non-null int64 6 hulu 16744 non-null int64 7 prime_video 16744 non-null int64 8 disney+ 16744 non-null int64 9 type 16744 non-null int64 10 directors 16018 non-null object 11 genres 16469 non-null object 12 country 16309 non-null object 13 language 16145 non-null object 14 runtime 16152 non-null float64 dtypes: float64(2), int64(6), object(7) memory usage: 2.0+ MB
raw_movies_df['rotten_tomatoes'] = raw_movies_df['rotten_tomatoes'].str.replace('%', '')
raw_movies_df.head()
| title | year | age | imdb | rotten_tomatoes | netflix | hulu | prime_video | disney+ | type | directors | genres | country | language | runtime | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ID | |||||||||||||||
| 1 | Inception | 2010 | 13+ | 8.8 | 87 | 1 | 0 | 0 | 0 | 0 | Christopher Nolan | Action,Adventure,Sci-Fi,Thriller | United States,United Kingdom | English,Japanese,French | 148.0 |
| 2 | The Matrix | 1999 | 18+ | 8.7 | 87 | 1 | 0 | 0 | 0 | 0 | Lana Wachowski,Lilly Wachowski | Action,Sci-Fi | United States | English | 136.0 |
| 3 | Avengers: Infinity War | 2018 | 13+ | 8.5 | 84 | 1 | 0 | 0 | 0 | 0 | Anthony Russo,Joe Russo | Action,Adventure,Sci-Fi | United States | English | 149.0 |
| 4 | Back to the Future | 1985 | 7+ | 8.5 | 96 | 1 | 0 | 0 | 0 | 0 | Robert Zemeckis | Adventure,Comedy,Sci-Fi | United States | English | 116.0 |
| 5 | The Good, the Bad and the Ugly | 1966 | 18+ | 8.8 | 97 | 1 | 0 | 1 | 0 | 0 | Sergio Leone | Western | Italy,Spain,West Germany | Italian | 161.0 |
#converting to number:
#run as float first!
raw_movies_df['rotten_tomatoes'] = raw_movies_df['rotten_tomatoes'].astype('float')
#raw_movies_df['rotten_tomatoes'] = raw_movies_df['rotten_tomatoes'].astype('int64')
#"int64" will not work because of the NaN values, but "Int64" does.
raw_movies_df['rotten_tomatoes'] = raw_movies_df['rotten_tomatoes'].astype('Int64')
raw_movies_df['rotten_tomatoes']
ID
1 87
2 87
3 84
4 96
5 97
...
16740 <NA>
16741 <NA>
16742 <NA>
16743 <NA>
16744 <NA>
Name: rotten_tomatoes, Length: 16744, dtype: Int64
raw_movies_df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 16744 entries, 1 to 16744 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 title 16744 non-null object 1 year 16744 non-null int64 2 age 7354 non-null object 3 imdb 16173 non-null float64 4 rotten_tomatoes 5158 non-null Int64 5 netflix 16744 non-null int64 6 hulu 16744 non-null int64 7 prime_video 16744 non-null int64 8 disney+ 16744 non-null int64 9 type 16744 non-null int64 10 directors 16018 non-null object 11 genres 16469 non-null object 12 country 16309 non-null object 13 language 16145 non-null object 14 runtime 16152 non-null float64 dtypes: Int64(1), float64(2), int64(6), object(6) memory usage: 2.1+ MB
Now for the dataset!
movie_df = raw_movies_df.copy()
movie_df.describe()
| year | imdb | rotten_tomatoes | netflix | hulu | prime_video | disney+ | type | runtime | |
|---|---|---|---|---|---|---|---|---|---|
| count | 16744.000000 | 16173.000000 | 5158.000000 | 16744.000000 | 16744.000000 | 16744.000000 | 16744.000000 | 16744.0 | 16152.000000 |
| mean | 2003.014035 | 5.902751 | 65.428461 | 0.212613 | 0.053930 | 0.737817 | 0.033684 | 0.0 | 93.413447 |
| std | 20.674321 | 1.347867 | 26.614496 | 0.409169 | 0.225886 | 0.439835 | 0.180419 | 0.0 | 28.219222 |
| min | 1902.000000 | 0.000000 | 2.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 1.000000 |
| 25% | 2000.000000 | 5.100000 | 44.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 82.000000 |
| 50% | 2012.000000 | 6.100000 | 71.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.0 | 92.000000 |
| 75% | 2016.000000 | 6.900000 | 88.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.0 | 104.000000 |
| max | 2020.000000 | 9.300000 | 100.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.0 | 1256.000000 |
#missing data
total = movie_df.isnull().sum().sort_values(ascending=False)
percent = (movie_df.isnull().sum()/movie_df.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data
| Total | Percent | |
|---|---|---|
| rotten_tomatoes | 11586 | 0.691949 |
| age | 9390 | 0.560798 |
| directors | 726 | 0.043359 |
| language | 599 | 0.035774 |
| runtime | 592 | 0.035356 |
| imdb | 571 | 0.034102 |
| country | 435 | 0.025979 |
| genres | 275 | 0.016424 |
| type | 0 | 0.000000 |
| disney+ | 0 | 0.000000 |
| prime_video | 0 | 0.000000 |
| hulu | 0 | 0.000000 |
| netflix | 0 | 0.000000 |
| year | 0 | 0.000000 |
| title | 0 | 0.000000 |
# is there way to do the chart above, more simply?
#they are removing rotten tomatoes b/c most of the data is missing. I want to explore why it is missing.
# of movies per platform, 2 different ways:
#using the same idea as with booleans, True=1, False=0, we can just sum the columns:
# movie_df['Netflix'].sum()
#or
netflix_count = len(movie_df[movie_df['netflix'] == 1].index) #why .index? "The index (row labels) of the DataFrame." additionally it is faster, both do the same thing: https://stackoverflow.com/questions/15943769/how-do-i-get-the-row-count-of-a-pandas-dataframe
hulu_count = len(movie_df[movie_df['hulu'] == 1].index)
prime_count = len(movie_df[movie_df['prime_video'] == 1].index)
disney_count = len(movie_df[movie_df['disney+'] == 1].index)
Making a new DF with these counts:
label = ['Netflix', 'Hulu', 'Prime Video', 'Disney']
count = [netflix_count, hulu_count, prime_count, disney_count]
platform = pd.DataFrame(
{'platform': label,
'movie_count': count,
})
platform
| platform | movie_count | |
|---|---|---|
| 0 | Netflix | 3560 |
| 1 | Hulu | 903 |
| 2 | Prime Video | 12354 |
| 3 | Disney | 564 |
fig = px.pie(platform, names='platform', values='movie_count')
fig.update_traces(rotation=45, pull=[0.1,0.03,0.03,0.03,0.03], textinfo='percent+label', title='Movie count per platform')
fig.show()
yearly_count = movie_df.groupby('year')['title'].count().reset_index().rename(columns={'title':"movie_count"})
fig = px.bar(yearly_count, x='year', y="movie_count", color='movie_count', height=600)
fig.show()
# average ratings:
movie_df['genres'].value_counts().head(5)
Drama 1341 Documentary 1229 Comedy 1040 Comedy,Drama 446 Horror 436 Name: genres, dtype: int64
top5_genres = ['Drama','Documentary','Comedy', 'Comedy,Drama','Horror']
table = movie_df.loc[:, ['year', 'genres', 'imdb']]
table['avg_rating'] = table.groupby([table.genres, table.year])['imdb'].transform('mean')
table.drop('imdb', axis=1, inplace=True)
table = table[(table.year > 2009) & (table.year<2020)]
table = table.loc[table['genres'].isin(top5_genres)]
table = table.sort_values('year')
fig = px.bar(table, x='genres', y='avg_rating', animation_frame='year', animation_group='genres',
color='genres', hover_name='genres', range_y=[0,10])
fig.update_layout(showlegend=False)
fig.show()
top_movies_imdb = movie_df[movie_df['imdb']>8.5][['title', 'directors', 'rotten_tomatoes', 'imdb']].sort_values(by=['rotten_tomatoes'], ascending=False)
top_movies_imdb
| title | directors | rotten_tomatoes | imdb | |
|---|---|---|---|---|
| ID | ||||
| 4474 | Stop Making Sense | Jonathan Demme | 100 | 8.6 |
| 4663 | Tom Petty and the Heartbreakers: Runnin' Down ... | Peter Bogdanovich | 100 | 8.6 |
| 3563 | Parasite | Bong Joon Ho | 99 | 8.6 |
| 5 | The Good, the Bad and the Ugly | Sergio Leone | 97 | 8.8 |
| 3562 | GoodFellas | Martin Scorsese | 96 | 8.7 |
| ... | ... | ... | ... | ... |
| 11161 | Draw Hard | Jon Nix | <NA> | 8.7 |
| 11208 | Along Recovery | Justin J. Springer | <NA> | 8.6 |
| 11874 | Ruby's Studio: the Feelings Show | Jason Docter,Matt Docter | <NA> | 8.8 |
| 11911 | Second Chance By Betrayal | Terrence Weasel Smith | <NA> | 8.7 |
| 15176 | Shred: The Story of Asher Bradshaw | Kathy Herndl | <NA> | 8.8 |
97 rows × 4 columns
top_movies_rt = movie_df[movie_df['rotten_tomatoes']>90.0][['title', 'directors', 'rotten_tomatoes', 'imdb']].sort_values(by=['rotten_tomatoes'], ascending=False)
top_movies_rt
| title | directors | rotten_tomatoes | imdb | |
|---|---|---|---|---|
| ID | ||||
| 16481 | Marvel Rising: Secret Warriors | Alfred Gimeno,Eric Radomski | 100 | 5.2 |
| 1332 | Attacking the Devil: Harold Evans and the Last... | David Morris,Jacqui Morris | 100 | 8.1 |
| 5214 | Top Spin | Sara Newens,Mina T. Son | 100 | 6.9 |
| 5228 | Bolivia | Israel Adrián Caetano | 100 | 7.3 |
| 5229 | The Story of Louis Pasteur | William Dieterle | 100 | 7.3 |
| ... | ... | ... | ... | ... |
| 3948 | All Square | John Hyams | 91 | 5.8 |
| 6111 | Werewolf | Freddie Francis | 91 | 6.0 |
| 6082 | Michael H. – Profession: Director | Yves Montmayeur | 91 | 7.0 |
| 5084 | Cutter's Way | Ivan Passer | 91 | 6.9 |
| 959 | Nobody Speak: Trials of the Free Press | Brian Knappenberger | 91 | 6.5 |
1035 rows × 4 columns
Comparing top imdb to top RT. not much overlap.
top_movies = top_movies_imdb.index & top_movies_rt.index
top_movies
Int64Index([4474, 4663, 3563, 5, 3562, 3561, 4440, 4460, 16214, 16213], dtype='int64', name='ID')
movie_df.head()
| title | year | age | imdb | rotten_tomatoes | netflix | hulu | prime_video | disney+ | type | directors | genres | country | language | runtime | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ID | |||||||||||||||
| 1 | Inception | 2010 | 13+ | 8.8 | 87 | 1 | 0 | 0 | 0 | 0 | Christopher Nolan | Action,Adventure,Sci-Fi,Thriller | United States,United Kingdom | English,Japanese,French | 148.0 |
| 2 | The Matrix | 1999 | 18+ | 8.7 | 87 | 1 | 0 | 0 | 0 | 0 | Lana Wachowski,Lilly Wachowski | Action,Sci-Fi | United States | English | 136.0 |
| 3 | Avengers: Infinity War | 2018 | 13+ | 8.5 | 84 | 1 | 0 | 0 | 0 | 0 | Anthony Russo,Joe Russo | Action,Adventure,Sci-Fi | United States | English | 149.0 |
| 4 | Back to the Future | 1985 | 7+ | 8.5 | 96 | 1 | 0 | 0 | 0 | 0 | Robert Zemeckis | Adventure,Comedy,Sci-Fi | United States | English | 116.0 |
| 5 | The Good, the Bad and the Ugly | 1966 | 18+ | 8.8 | 97 | 1 | 0 | 1 | 0 | 0 | Sergio Leone | Western | Italy,Spain,West Germany | Italian | 161.0 |
movies_by_country = movie_df.groupby('country')['title'].count().reset_index().sort_values('title',ascending = False).head(10).rename(columns = {'title':'movie_count'})
fig = px.pie(movies_by_country,names='country', values='movie_count')
fig.update_traces(rotation=180, pull=[0.1,0.03,0.03,0.03,0.03],textinfo="percent+label", title='Movie Count per Country')
fig.update_layout(showlegend=False)
fig.show()
def movies_count(platform, count=False):
if count==False:
print('Movies in {} are {}'. format(platform, movie_df[platform].sum()))
else:
return movie_df[platform].sum()
labels = 'netflix', 'hulu', 'prime_video', 'disney'
size = [movies_count('netflix', count=True),
movies_count('hulu', count=True),
movies_count('prime_video', count=True),
movies_count('disney+', count=True)]
explode = (0.1, 0.1, 0.1, 0.1)
#plotting
fig1, ax1 = plt.subplots()
ax1.pie(size,
labels = labels,
autopct = '%1.1f%%',
explode = explode,
shadow = True,
startangle = 100)
ax1.axis = ('equal')
plt.show()
is the above double counting movies that list 2 countries??, no it doesen't seem so but its lieaving out stuff them.
#runtime
sns.displot(movie_df['runtime']);
sns.displot(movie_df['imdb']);
#sns.distplot(movie_df['rotten_tomatoes']);
#ValueError: cannot convert to 'float64'-dtype NumPy array with missing values. Specify an appropriate 'na_value' for this dtype.
movie_df.imdb.describe()
count 16173.000000 mean 5.902751 std 1.347867 min 0.000000 25% 5.100000 50% 6.100000 75% 6.900000 max 9.300000 Name: imdb, dtype: float64
#language
count_by_language = movie_df.groupby('language')['title'].count().reset_index().sort_values('title', ascending=False).head(10).rename(columns={'title': 'movie_count'})
fig = px.bar(count_by_language, x='language', y='movie_count', color='movie_count', height=600)
fig.show()
plt.figure(figsize=(15,7))
chains=movie_df['language'].value_counts()[:20]
sns.barplot(x=chains, y=chains.index, palette='Set2')
plt.title('Languages most commonly made', size=20, pad=20)
plt.xlabel('counts', size=15);
(movie_df['language'] == 'English').sum()
10955
len(movie_df.language)
16744
missing_data.loc[missing_data.index == 'language']
| Total | Percent | |
|---|---|---|
| language | 599 | 0.035774 |
turn above into a new table for each with percent each language
#Runtime
top_runtime = movie_df.sort_values('runtime', ascending=False).head(10)
fig = px.bar(top_runtime, x='title', y='runtime', color='runtime', height=600)
fig.show()
top_directors = movie_df.groupby('directors')['title'].count().reset_index().rename(columns={'title': 'movie_count'}).sort_values('movie_count', ascending=False).head(10)
fig = px.bar(top_directors, x='directors', y='movie_count', color='movie_count', height=600)
fig.show()
plt.figure(figsize=(15,7))
chains=movie_df['directors'].value_counts()[:20]
sns.barplot(x=chains, y=chains.index, palette='Set1')
plt.title('Most movies made by director', size=20, pad=20)
plt.xlabel('counts', size=15);
top_genres = movie_df.groupby('genres')['title'].count().reset_index().rename(columns={'title': 'movie_count'}).sort_values('movie_count', ascending=False).head(10)
fig = px.bar(top_genres, x='genres', y='movie_count', color='movie_count', height=600)
fig.show()
plt.figure(figsize=(15,7))
chains=movie_df['genres'].value_counts()[:20]
sns.barplot(x=chains, y=chains.index, palette='Set2')
plt.title('Genre Counts', size=20, pad=20)
plt.xlabel('counts', size=15);
movie_df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 16744 entries, 1 to 16744 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 title 16744 non-null object 1 year 16744 non-null int64 2 age 7354 non-null object 3 imdb 16173 non-null float64 4 rotten_tomatoes 5158 non-null Int64 5 netflix 16744 non-null int64 6 hulu 16744 non-null int64 7 prime_video 16744 non-null int64 8 disney+ 16744 non-null int64 9 type 16744 non-null int64 10 directors 16018 non-null object 11 genres 16469 non-null object 12 country 16309 non-null object 13 language 16145 non-null object 14 runtime 16152 non-null float64 dtypes: Int64(1), float64(2), int64(6), object(6) memory usage: 2.1+ MB
sns.heatmap(movie_df.isnull())
<AxesSubplot:ylabel='ID'>
movie_df.isnull().sum()
title 0 year 0 age 9390 imdb 571 rotten_tomatoes 11586 netflix 0 hulu 0 prime_video 0 disney+ 0 type 0 directors 726 genres 275 country 435 language 599 runtime 592 dtype: int64
print("# Age Null: {}".format(movie_df.age.isnull().sum()))
# Age Null: 9390
movie_df.age.value_counts(), movie_df.age.shape
(18+ 3474 7+ 1462 13+ 1255 all 843 16+ 320 Name: age, dtype: int64, (16744,))
sns.countplot(x='age', data=movie_df)
<AxesSubplot:xlabel='age', ylabel='count'>
sns.countplot(x='netflix', data=movie_df);
sns.pairplot(movie_df) fig=plt.gcf() fig.set_size_inches(20,20)
sns.scatterplot(x='year', y='runtime', hue='age', data=movie_df)
fig=plt.gcf()
fig.set_size_inches(8,8)
sns.displot(movie_df['year'])
<seaborn.axisgrid.FacetGrid at 0x7f93fd91c1d0>
plt.figure(figsize=(15,7))
chains=movie_df['country'].value_counts()[:20]
sns.barplot(x=chains, y=chains.index, palette='Set3')
plt.title('Movies made by country', size=20, pad=20)
plt.xlabel('counts', size=15);
sns.displot(movie_df['imdb'], bins=20);
sns.displot(movie_df['imdb'], bins=50);
sns.displot(movie_df['imdb'], bins=10);
def data_inv(df):
print('No of Rows: ', df.shape[0])
print('No of Coloums: ', df.shape[1])
print('**'*25)
print('Colums Names: \n', df.columns)
print('**'*25)
print('Datatype of Columns: \n', df.dtypes)
print('**'*25)
print('Missing Values: ')
c = df.isnull().sum()
c = c[c>0]
print(c)
print('**'*25)
print('Missing vaules %age wise:\n')
print((100*(df.isnull().sum()/len(df.index))))
print('**'*25)
print('Pictorial Representation:')
plt.figure(figsize=(8,6))
sns.heatmap(df.isnull(), yticklabels=False,cbar=False)
plt.show()
data_inv(movie_df)
No of Rows: 16744
No of Coloums: 15
**************************************************
Colums Names:
Index(['title', 'year', 'age', 'imdb', 'rotten_tomatoes', 'netflix', 'hulu',
'prime_video', 'disney+', 'type', 'directors', 'genres', 'country',
'language', 'runtime'],
dtype='object')
**************************************************
Datatype of Columns:
title object
year int64
age object
imdb float64
rotten_tomatoes Int64
netflix int64
hulu int64
prime_video int64
disney+ int64
type int64
directors object
genres object
country object
language object
runtime float64
dtype: object
**************************************************
Missing Values:
age 9390
imdb 571
rotten_tomatoes 11586
directors 726
genres 275
country 435
language 599
runtime 592
dtype: int64
**************************************************
Missing vaules %age wise:
title 0.000000
year 0.000000
age 56.079790
imdb 3.410177
rotten_tomatoes 69.194935
netflix 0.000000
hulu 0.000000
prime_video 0.000000
disney+ 0.000000
type 0.000000
directors 4.335882
genres 1.642379
country 2.597946
language 3.577401
runtime 3.535595
dtype: float64
**************************************************
Pictorial Representation:
dropping all null data rows: movies.dropna(subset=['Directors', 'Genres', 'Country', 'Language', 'Runtime'],inplace=True)
movies.drop_duplicates(inplace=True)
movie_df['title'].value_counts()
Saleslady 1
The Cape Town Affair 1
Upside Down 1
Blackbelt 1
Adam Ferrara: Funny As Hell 1
..
Ill Manors 1
The Devil and Father Amorth 1
Return of the Street Fighter 1
Mucize 1
Lang Tong 1
Name: title, Length: 16744, dtype: int64
#Writing a function to calculate the movies in different platforms
def movies_count(platform, count=False):
if count==False:
print('Movies in {} are {}'. format(platform,movie_df[platform].sum()))
else:
return movie_df[platform].sum()
movies_count('netflix')
movies_count('hulu')
movies_count('prime_video')
movies_count('disney+')
Movies in netflix are 3560 Movies in hulu are 903 Movies in prime_video are 12354 Movies in disney+ are 564
Directors: Directors, in this column, some directors are present with a ',' so I'll split the names by ',' & then stack it one after the other for easy analysis. Then I wanted to find the director which has the maximum number of movies, to accomplish this, I have set a threshold (10) & plotted the directors which directed more than 10 movies.
from pandas import Series
s = movie_df['directors'].str.split(',').apply(Series, 1).stack()
s.index = s.index.droplevel(-1)
s.name = 'directors'
del movie_df['directors']
df_directors = movie_df.join(s)
data = df_directors['directors'].value_counts()
threshold = 10
prob = data > threshold
data_new = data.loc[prob]
plt.figure(figsize=(10,10))
data_new.plot(kind='bar')
plt.show()
s = movie_df['genres'].str.split(',').apply(Series, 1).stack()
s.index = s.index.droplevel(-1)
s.name = 'genres'
del movie_df['genres']
df_genres = movie_df.join(s)
df_genres.head()
| title | year | age | imdb | rotten_tomatoes | netflix | hulu | prime_video | disney+ | type | country | language | runtime | genres | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ID | ||||||||||||||
| 1 | Inception | 2010 | 13+ | 8.8 | 87 | 1 | 0 | 0 | 0 | 0 | United States,United Kingdom | English,Japanese,French | 148.0 | Action |
| 1 | Inception | 2010 | 13+ | 8.8 | 87 | 1 | 0 | 0 | 0 | 0 | United States,United Kingdom | English,Japanese,French | 148.0 | Adventure |
| 1 | Inception | 2010 | 13+ | 8.8 | 87 | 1 | 0 | 0 | 0 | 0 | United States,United Kingdom | English,Japanese,French | 148.0 | Sci-Fi |
| 1 | Inception | 2010 | 13+ | 8.8 | 87 | 1 | 0 | 0 | 0 | 0 | United States,United Kingdom | English,Japanese,French | 148.0 | Thriller |
| 2 | The Matrix | 1999 | 18+ | 8.7 | 87 | 1 | 0 | 0 | 0 | 0 | United States | English | 136.0 | Action |
plt.figure(figsize=(10,10))
sns.countplot(x='genres', data=df_genres)
plt.xticks(rotation=90)
plt.show()
s = movie_df['country'].str.split(',').apply(Series, 1).stack()
s.index = s.index.droplevel(-1)
s.name = 'country'
del movie_df['country']
df_country = movie_df.join(s)
df_country['country'].value_counts()[:10].plot(kind='bar')
plt.show()
plt.figure(figsize=(15,7))
chains=df_country['country'].value_counts()[:20]
sns.barplot(x=chains, y=chains.index, palette='Set3')
plt.title('Movies made by country', size=20, pad=20)
plt.xlabel('counts', size=15);
s = movie_df['language'].str.split(',').apply(Series, 1).stack()
s.index = s.index.droplevel(-1)
s.name = 'language'
del movie_df['language']
df_language = movie_df.join(s)
plt.figure(figsize=(15,7))
chains=df_language['language'].value_counts()[:20]
sns.barplot(x=chains, y=chains.index, palette='Set3')
plt.title('Language', size=20, pad=20)
plt.xlabel('counts', size=15);
plt.figure(figsize=(15,7))
chains=raw_movies_df['language'].value_counts()[:20]
sns.barplot(x=chains, y=chains.index, palette='Set3')
plt.title('Language', size=20, pad=20)
plt.xlabel('counts', size=15);